Git doesn't render in-text .ipynb hyperlinks; to use them, check out this notebook on nbviewer.
This notebook queries our video_stats table with SQL and creates a potential SQL view for later use.
All data is based on a playlist of 306 videos, titled Important Videos, that went viral around 2012.
A lot of these videos are weird memes and some are actually funny, but there is certainly a range.
from datetime import datetime
import pandas as pd
import math
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from psycopg2 import sql, connect
import psycopg2
import pandas as pd
import plotly.express as px
from IPython.display import IFrame
#import credentials (database host address) from your youtube_config.py
import sys
sys.path.insert(1, '../../')
from youtube_config import host
- Both functions below make clickable HTML links for our video URLs, so we can take breaks from our research to see whether the videos we're researching deserve their notoriety.
- This function lets us search for any string in the video titles and returns each match's most recent stats.
- Think of this function as a Google search scoped to this specific set of videos.
def make_clickable(val):
    """
    Wrap a URL in an HTML anchor tag so it renders as a clickable link.
    val: input url
    """
    return f'<a href="{val}">{val}</a>'
def make_df_url(df):
    """
    Return a Styler for df with the 'pl_url' column rendered as clickable links.
    df: input data frame
    """
    styled = df.style
    return styled.format(make_clickable, subset='pl_url')
def findVideo(input_str=None, df=None):
    """
    Search for a string within video titles (case-insensitive) and return the
    most recent stats of each matching video as a styled dataframe with
    clickable 'pl_url' links.

    input_str: substring to look for in video titles; None or "" matches every title
    df: dataframe to search; defaults to the module-level sql_df
    """
    if df is None:
        df = sql_df
    # Treat a missing search string as "match everything" — the old default of
    # None crashed immediately on None.lower().
    needle = (input_str or "").lower()
    matches = df[df['title'].str.lower().str.contains(needle)].sort_values(
        by='datetime', ascending=False).drop_duplicates('title').sort_values(by="title")
    return matches.style.format(make_clickable, subset='pl_url')
# --- PostgreSQL connection parameters ---
db_name = "youtube_test"
user = "postgres"
host = host  # host was imported from youtube_config above; rebound here for symmetry with the other params
password = "mypw"  # NOTE(review): hard-coded credential — consider moving into youtube_config alongside host
try:
    # declare a new PostgreSQL connection object
    conn = connect(
        dbname = db_name,
        user = user,
        host = host,
        password = password
    )
    # print the connection if successful
    # str(conn) is sliced so the dbname/user fields are shown while the real host value stays masked
    print ("psycopg2 connection:\n", str(conn).split(' ')[5:8], "host=host", str(conn).split(',')[1] )
    # cursor used by all queries later in the notebook
    cur = conn.cursor()
except Exception as err:
    # connection failed: report the error and leave conn as None so later cells can check it
    print ("psycopg2 connect() ERROR:", err)
    conn = None
This query can be difficult to read outside of a native sql text editor, so I outline the structure here:
Subquery to pull our newly defined stats
(
CTE used to generate our first set of new stats and values to confirm we create clean data
The CTE generates deltas based on current stats of the video minus the second most recent record
EX: viewcount_delta = viewcount_t1 - viewcount_t0
The deltas are made by ordering records OVER (ORDER BY title, vid_date, vid_TIME) and subtracting the
previous record with the LAG() window function
EX: viewcount_delta = viewcount - LAG(viewcount) OVER (ORDER BY title, vid_date, vid_TIME)
LAG previous title to check if it matches current title. Will clean data
WHERE title = last_title.
CASE WHEN record title matches previous record title
When condition makes sure deltas are only calculated between the same column title.
Prevents edge cases where the previous title rolls over to a new title.
THEN subtract stats to make deltas.
END
JOIN on another table that has title because our current table does not.
)
SELECT new deltas FROM CTE and create time_delta_min and view_per_min stats
WHERE title = last_title ensures all deltas were calc'd between the same title.
AND view_delta IS NOT NULL filters out edge cases where title != last_title
try:
    # Build the analysis view: for each video snapshot, compute deltas against
    # the previous snapshot of the SAME title, then derive per-minute view rates.
    cur.execute("""
    --subquery to generate final view of stats
    SELECT title, pl_url, position, videopublishedat, vid_date,
           viewcount, last_view, view_delta, view_per_min,
           datetime, last_time, time_delta, time_delta_min,
           commentcount, commentcount_delta,
           likecount, likecount_delta,
           dislikecount, dislikecount_delta
    FROM
    (
        --CTE used to generate stats from viewcount, likecount, etc. The CTE is transforming the raw data into more powerful data
        WITH cte AS (
            SELECT viewcount, likecount, dislikecount, commentcount,
                   vid_date, vid_time, title, playlists.position, pl_url, videopublishedat,
                   --check last_title matches current title to ensure data is clean when calc'ing deltas with LAG()
                   LAG(title) OVER (ORDER BY title, vid_date, vid_TIME) AS last_title,
                   --check last_view count to ensure data is clean when calc'ing view_delta with LAG()
                   LAG(viewcount) OVER (ORDER BY title, vid_date, vid_TIME) AS last_view,
                   --check last_time date to ensure data is clean when calc'ing time_delta with LAG()
                   LAG(vid_date + vid_time) OVER (ORDER BY title, vid_date, vid_TIME ) AS last_time,
                   --create datetime column combining vid_date and vid_time cols
                   (vid_date + vid_time) AS datetime,
                   --CASE statements to calculate deltas of each statistic
                   CASE
                       /*When title = LAG(title) ensures the following delta corresponds to videos with the same title
                       This is needed because the records ORDER BY title, vid_date, vid_TIME
                       So when the ordered records proceed to the next title, you don't care about the delta for A_title_views-B_title_views.
                       */
                       When title = LAG(title) OVER (ORDER BY title, vid_date, vid_time)
                       --Then calculate delta over stat
                       THEN viewcount - LAG(viewcount) OVER (ORDER BY title, vid_date, vid_time)
                   END AS view_delta,
                   CASE
                       WHEN title = LAG(title) OVER (ORDER BY title, vid_date, vid_time)
                       THEN (vid_date + vid_time) - LAG(vid_date + vid_time) OVER (ORDER BY title, vid_date, vid_time)
                   END AS time_delta,
                   CASE
                       WHEN title = LAG(title) OVER (ORDER BY title, vid_date, vid_time)
                       THEN (commentcount) - LAG(commentcount) OVER (ORDER BY title, vid_date, vid_time)
                   END AS commentcount_delta,
                   CASE
                       WHEN title = LAG(title) OVER (ORDER BY title, vid_date, vid_time)
                       THEN (likecount) - LAG(likecount) OVER (ORDER BY title, vid_date, vid_time)
                   END AS likecount_delta,
                   CASE
                       WHEN title = LAG(title) OVER (ORDER BY title, vid_date, vid_time)
                       THEN (dislikecount) - LAG(dislikecount) OVER (ORDER BY title, vid_date, vid_time)
                   END AS dislikecount_delta
            FROM video_stats
            INNER JOIN playlists ON playlists.resourceid=video_stats.resourceid
        )
        --select * from CTE to do final transformations
        SELECT *,
               --calculate time_delta_min to determine watch rate as view_per_min
               ROUND(CAST(EXTRACT(EPOCH FROM time_delta::INTERVAL)/60 AS NUMERIC), 2) AS time_delta_min,
               ROUND(CAST(view_delta / (EXTRACT(EPOCH FROM time_delta::INTERVAL)/60) AS NUMERIC),2) as view_per_min
        FROM cte
        --where title = last_title verifies all deltas calc'd are a difference from the same title. Keeps data clean if anything slips through.
        WHERE title = last_title
        /*where view_delta IS NOT NULL removes all records where a delta wasnt calculated due to title != LAG(title).
        This occurs ordered records proceed to the next title, so it results in NULL for A_title_views-B_title_views in our cte CASE statements
        */
        AND view_delta IS NOT NULL
    ) temptable
    """)
    # materialize the result set into a dataframe using the cursor's column names
    sql_return = cur.fetchall()
    columns = [column[0] for column in cur.description]
    sql_df = pd.DataFrame(sql_return, columns = columns)
except Exception as err:
    # this except guards the query/fetch, not the connection — the old message
    # mislabelled failures here as connect() errors
    print ("psycopg2 query ERROR:", err)
# Derive calendar features from the combined datetime column for grouping below.
stamp = sql_df['datetime'].dt
sql_df['hour'] = stamp.hour            # hour of day (0-23)
sql_df['day'] = stamp.day_name()       # weekday name, e.g. 'Monday'
sql_df['day_num'] = stamp.weekday      # weekday number (Mon=0), used to sort day names
sql_df['view_per_min'] = sql_df['view_per_min'].astype(float)  # ensure float dtype for plotting
The value of a view is actually defined by engagement with the ads, but Business Insider assumes 500 USD per 100,000 views.
The top video is HEYYEYAAEYAAAEYAEYAA with 171M views, ~855,000 USD in revenue.
The links to both videos are in the dataframe below the graph.
# Keep only the records from the most recent snapshot, ranked by view count.
# The old version sorted by datetime and then used .loc[0, 'datetime'], but
# .loc is label-based, so it always picked the row whose index LABEL is 0 —
# ignoring the sort entirely. .max() selects the latest snapshot directly.
top_views = sql_df[sql_df['datetime'] == sql_df['datetime'].max()].sort_values(
    by='viewcount', ascending=False)
fig = px.scatter(top_views, x="position", y="viewcount", color='title')
fig.update_layout(showlegend=False)  # 300+ titles would overwhelm the legend
fig.show()
make_df_url(top_views[:2])  # clickable links for the two most-viewed videos
# Average the numeric stats per (day, title). numeric_only=True makes the
# silent drop of string columns (pl_url, day, ...) explicit — pandas >= 2.0
# raises a TypeError without it.
avg_views_day = sql_df.groupby(by=['vid_date','title']).mean(numeric_only=True).reset_index()
fig = px.line(avg_views_day, x="vid_date", y="view_per_min", color='title')
fig.update_layout(title="Views/Min of Title by Day",
    legend=dict(
        title="Legend",
        yanchor="top",
        y=-0.7,       # push the large legend below the plot area
        xanchor="center",
        x=0.5
    ))
fig.show()
# Example lookup: latest stats for every video whose title contains "Have you ever"
findVideo("Have you ever")
# Hoist the hourly aggregate so it is computed once instead of twice;
# numeric_only=True keeps pandas >= 2.0 from raising on string columns.
hourly_avg = sql_df.groupby(by='hour').mean(numeric_only=True)
fig = px.line(hourly_avg, x=hourly_avg.index, y=['view_per_min'])
fig.update_layout(
    title="Views/Min by Hour of Day",
    xaxis_title="hour of day",
    yaxis_title="view per min",
    legend_title="Legend",
)
fig.show()
# Sanity check: hourly means with the suspicious 16:00 and 18:00 buckets excluded
# (bare expression — displayed by the notebook, no assignment needed)
sql_df[(sql_df['hour']!=16) & (sql_df['hour']!=18)].groupby(by='hour').mean(numeric_only=True).reset_index()
# Keep only odd hours — presumably the even-hour snapshots were unreliable; TODO confirm
clean_df = sql_df[sql_df['hour'] % 2 > 0]
fig = px.line(clean_df.groupby(by='hour').mean(numeric_only=True).reset_index(),
              x='hour',
              y=['view_per_min'])
fig.update_layout(
    title="Views/Min by Hour of Day",
    xaxis_title="hour of day",
    yaxis_title="view per min",
    legend_title="Legend",
)
fig.show()
# Average views/min by weekday, ordered Mon->Sun via the numeric day_num column
# (day_num is numeric, so it survives the mean and can drive the sort).
fig = px.line(clean_df.groupby(by='day').mean(numeric_only=True).reset_index().sort_values(by='day_num'),
              x='day',
              y='view_per_min')
fig.show()
# Persist the figure and embed the saved HTML back into the notebook so the
# interactive chart survives outside a live kernel.
fig.write_html("./viz_img/scatter_viewcnt.html")
IFrame(src="./viz_img/scatter_viewcnt.html", width=1000, height=600)
# Views/min by hour of day, one line per weekday; numeric_only=True makes the
# drop of non-numeric columns explicit (required on pandas >= 2.0).
fig = px.line(clean_df.groupby(by=['day','hour']).mean(numeric_only=True).reset_index(),
              x='hour',
              y=['view_per_min'], color='day')
fig.show()
cols = ['title', 'view_per_min', 'viewcount', 'last_view']
# Distribution of views/min per weekday, with outliers (>= 10 views/min) excluded.
# hover_data previously passed sql_df[cols] — a frame whose index does not match
# the filtered clean_df subset being plotted. Passing the column NAMES lets
# plotly pull the hover values from the plotted frame itself.
fig = px.violin(clean_df[clean_df['view_per_min']<10].sort_values(by='day_num'),
                y="view_per_min", x="day", color="day",
                hover_data=cols)
fig.show()
# Negative view rates are suspicious (view counts rarely decrease) — peek at
# the first few offending records for manual inspection.
clean_df[clean_df['view_per_min']<0][:5]